# Load libraries (each listed once - tidyverse, car, and patchwork were
# previously loaded twice)
library(tidyverse)   # data wrangling and plotting - for all things!
library(car)         # for assumptions (crPlots, residualPlots, VIF) and bootstrapping
library(performance) # model diagnostics
library(patchwork)   # grouping plots together
library(psych)       # good for descriptive stats
library(kableExtra)  # useful for creating nice tables
library(sjPlot)      # regression tables & plots
library(emmeans)     # for contrasts
# Read in datasets from the course data repository.
# data1: attendance plus predictors (used for RQ1 and RQ2)
# data2: final marks and attendance (used for RQ3)
data1 <- read_csv("https://uoepsy.github.io/data/DapR2_S1B2_PracticalPart1.csv")
data2 <- read_csv("https://uoepsy.github.io/data/DapR2_S1B2_PracticalPart2.csv")
# Preview the first dataset
data1
# Data dictionary for data1, rendered as a gt table
data1_descriptions <- c(
  "Participant ID number",
  "Total attendance (in days)",
  "Conscientiousness (Levels: Low, Moderate, High)",
  "Time of Class (Levels: 9AM, 10AM, 11AM, 12PM, 1PM, 2PM, 3PM, 4PM)",
  "Frequency of access to online course materials (Levels: Rarely, Sometimes, Often)",
  "Year of Study in University (Y1, Y2, Y3, Y4, MSc, PhD)"
)
tibble(Variable = names(data1), Description = data1_descriptions) |>
  gt::gt()
| Variable | Description |
|---|---|
| pid | Participant ID number |
| Attendance | Total attendance (in days) |
| Conscientiousness | Conscientiousness (Levels: Low, Moderate, High) |
| Time | Time of Class (Levels: 9AM, 10AM, 11AM, 12PM, 1PM, 2PM, 3PM, 4PM) |
| OnlineAccess | Frequency of access to online course materials (Levels: Rarely, Sometimes, Often) |
| Year | Year of Study in University (Y1, Y2, Y3, Y4, MSc, PhD) |
Use this to address RQs 1 and 2.
RQ1 uses the variables Conscientiousness,
OnlineAccess, and Year and looks at their
associations with Attendance. (… strictly speaking this
should be a Poisson model, not a Gaussian)
RQ2 looks at association between Time and
Attendance.
# Data dictionary for data2, rendered as a gt table
data2_descriptions <- c(
  "Final grade (0-100)",
  "Total attendance (in days)"
)
tibble(Variable = names(data2), Description = data2_descriptions) |>
  gt::gt()
| Variable | Description |
|---|---|
| Marks | Final grade (0-100) |
| Attendance | Total attendance (in days) |
Use this to address RQ 3.
#######
# Coding of Variables
#######
# Check how each variable was read in - all categorical variables
# (Conscientiousness, Time, OnlineAccess, Year) arrive as character,
# so they will need converting to factors before modelling
str(data1)
spc_tbl_ [397 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ pid : num [1:397] 1 2 3 4 5 6 7 8 9 10 ...
$ Attendance : num [1:397] 9 10 0 8 6 6 9 14 5 10 ...
$ Conscientiousness: chr [1:397] "High" "High" "Low" "Low" ...
$ Time : chr [1:397] "3PM" "2PM" "10AM" "4PM" ...
$ OnlineAccess : chr [1:397] "Often" "Often" "Rarely" "Often" ...
$ Year : chr [1:397] "Y3" "Y3" "Y2" "Y4" ...
- attr(*, "spec")=
.. cols(
.. pid = col_double(),
.. Attendance = col_double(),
.. Conscientiousness = col_character(),
.. Time = col_character(),
.. OnlineAccess = col_character(),
.. Year = col_character()
.. )
- attr(*, "problems")=<externalptr>
# Check coding of the second dataset (both columns read in as numeric)
str(data2)
spc_tbl_ [200 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ Marks : num [1:200] 25.2 25.8 25.4 26.4 27.4 ...
$ Attendance: num [1:200] 10.5 11 11.5 12 12.5 13 13.5 14 14.5 15 ...
- attr(*, "spec")=
.. cols(
.. Marks = col_double(),
.. Attendance = col_double()
.. )
- attr(*, "problems")=<externalptr>
# Check for missing values: every cell returns FALSE for is.na,
# so data1 contains no missing values
data1 |>
  is.na() |>
  table()
FALSE
2382
# Same missing-value check for data2: no NAs
data2 |>
  is.na() |>
  table()
FALSE
400
# Convert the four categorical variables from character to factor in one go
data1 <- data1 |>
  mutate(across(c(OnlineAccess, Time, Conscientiousness, Year), as_factor))
# Specify reference levels: "Sometimes" for OnlineAccess and "Moderate" for
# Conscientiousness (an equivalent tidyverse approach is used for Year below -
# see lecture example code)
data1 <- data1 |>
  mutate(
    OnlineAccess = relevel(OnlineAccess, ref = "Sometimes"),
    Conscientiousness = relevel(Conscientiousness, ref = "Moderate")
  )
# Put the Year levels in chronological order, making Y1 the reference group
data1 <- data1 |>
  mutate(Year = factor(Year, levels = c("Y1", "Y2", "Y3", "Y4", "MSc", "PhD")))
# Marginal distributions: a histogram for the continuous outcome (Attendance)
# and bar plots for each categorical predictor
p1 <- ggplot(data1, aes(x = Attendance)) +
  geom_histogram() +
  labs(x = "Attendance", y = "Frequency")

p2 <- ggplot(data1, aes(x = Conscientiousness)) +
  geom_bar() +
  labs(x = "Conscientiousness Level", y = "Frequency")

p3 <- ggplot(data1, aes(x = Year)) +
  geom_bar() +
  labs(x = "Year of Study", y = "Frequency")

p4 <- ggplot(data1, aes(x = OnlineAccess)) +
  geom_bar() +
  labs(x = "Frequency of Access to Online Materials", y = "Frequency")

# Stack the four panels vertically with patchwork
p1 / p2 / p3 / p4
# Bivariate associations: boxplots of Attendance by each categorical predictor.
# Legends are removed - they duplicate the x-axis labels and take up space.
p5 <- ggplot(data1, aes(x = Conscientiousness, y = Attendance, fill = Conscientiousness)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(x = "Conscientiousness Level", y = "Attendance")

p6 <- ggplot(data1, aes(x = OnlineAccess, y = Attendance, fill = OnlineAccess)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(x = "Frequency of Access to Online Materials", y = "Attendance")

p7 <- ggplot(data1, aes(x = Year, y = Attendance, fill = Year)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(x = "Year of Study", y = "Attendance")

# Stack the three panels vertically with patchwork
p5 / p6 / p7
# Check how many observations fall in each category - all groups are
# reasonably sized (smallest is PhD with n = 23), so group comparisons are viable
table(data1$Conscientiousness)
Moderate High Low
146 124 127
table(data1$OnlineAccess)
Sometimes Often Rarely
170 126 101
table(data1$Year)
Y1 Y2 Y3 Y4 MSc PhD
89 100 66 71 48 23
# data1 |>
# group_by(Year, OnlineAccess, Conscientiousness) |>
# summarise(n = n(),
# Mean = mean(Attendance),
# SD = sd(Attendance),
# Minimum = min(Attendance),
# Maximum = max(Attendance)) |>
# kable(caption = "Attendance and Academic Year, Frequency of Online Material Access, Conscientiousness Descriptive Statistics", digits = 2) %>%
# kable_styling()
^ The kable table above is commented out deliberately: grouping by Year, OnlineAccess, and Conscientiousness simultaneously produces a very large table of sparse cells, so it is not informative enough to present.
Visualise Attendance
# RQ1: fit an additive linear model predicting Attendance from
# Conscientiousness (ref: Moderate), OnlineAccess (ref: Sometimes),
# and Year (ref: Y1)
m1 <- lm(Attendance ~ Conscientiousness + OnlineAccess + Year, data = data1)
# Check the model summary (coefficients, R-squared, overall F-test)
summary(m1)
Call:
lm(formula = Attendance ~ Conscientiousness + OnlineAccess +
Year, data = data1)
Residuals:
Min 1Q Median 3Q Max
-37.657 -6.990 -0.279 6.085 31.844
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 27.874 1.533 18.179 < 2e-16 ***
ConscientiousnessHigh 7.366 1.392 5.292 2.03e-07 ***
ConscientiousnessLow -10.292 1.399 -7.359 1.12e-12 ***
OnlineAccessOften -3.533 1.339 -2.639 0.008649 **
OnlineAccessRarely -5.378 1.441 -3.732 0.000218 ***
YearY2 4.574 1.657 2.760 0.006049 **
YearY3 3.418 1.853 1.844 0.065926 .
YearY4 4.266 1.817 2.347 0.019418 *
YearMSc 5.649 2.046 2.760 0.006051 **
YearPhD 12.484 2.661 4.692 3.76e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 11.35 on 387 degrees of freedom
Multiple R-squared: 0.3574, Adjusted R-squared: 0.3424
F-statistic: 23.91 on 9 and 387 DF, p-value: < 2.2e-16
# Assumption checks for m1:
# Linearity: Can be assumed as working with categorical predictors
# Independence of Errors: Using a between-subjects design, so can assume this
# Normality (either use plot(model, which = 2) or hist(model$residuals))
plot(m1, which = 2, main = "Normality Assumption Check for m1")
# Equal Variances: residuals vs fitted values should show roughly constant spread
residualPlot(m1, main = "Equal Variances Assumption Check for m1")
#### Overall, assumption checks look fine
# Publication-ready regression table for m1, relabelling the dummy-coded
# predictors so the table is readable without knowing the factor level names
tab_model(m1,
pred.labels = c('Intercept', 'Conscientiousness - High', 'Conscientiousness - Low', 'Online Access - Often', 'Online Access - Rarely',
'UG Y2', 'UG Y3', 'UG Y4', 'MSc', 'PhD'),
title = "RQ1: Regression Table for Attendance Model")
# Put the Time levels in chronological order (9AM becomes the reference group)
data1 <- data1 |>
  mutate(Time = factor(Time, levels = c("9AM", "10AM", "11AM", "12PM", "1PM", "2PM", "3PM", "4PM")))
# Numeric descriptives: Attendance summarised within each class Time,
# formatted as a kable table
data1 |>
group_by(Time) |>
summarise(n = n(),
Mean = mean(Attendance),
SD = sd(Attendance),
Minimum = min(Attendance),
Maximum = max(Attendance)) |>
kable(caption = "Attendance & Class Time Descriptive Statistics", digits = 2) |>
kable_styling()
| Time | n | Mean | SD | Minimum | Maximum |
|---|---|---|---|---|---|
| 9AM | 56 | 20.12 | 10.08 | 1 | 49 |
| 10AM | 48 | 27.00 | 14.23 | 0 | 60 |
| 11AM | 46 | 27.78 | 14.57 | 2 | 52 |
| 12PM | 47 | 31.30 | 14.11 | 0 | 55 |
| 1PM | 45 | 33.47 | 14.44 | 4 | 60 |
| 2PM | 46 | 32.43 | 12.44 | 0 | 60 |
| 3PM | 52 | 31.67 | 13.58 | 5 | 54 |
| 4PM | 57 | 24.75 | 13.78 | 4 | 54 |
# Check how many observations in each Time category - groups are
# well balanced (45 to 57 per slot)
table(data1$Time)
9AM 10AM 11AM 12PM 1PM 2PM 3PM 4PM
56 48 46 47 45 46 52 57
# Visual descriptives: counts per class time, and Attendance by class time
p8 <- ggplot(data1, aes(x = Time)) +
  geom_bar()

p9 <- ggplot(data1, aes(x = Time, y = Attendance, fill = Time)) +
  geom_boxplot()

# Stack the two panels with patchwork
p8 / p9
# RQ2: fit a linear model predicting Attendance from class Time
# (dummy-coded, reference level 9AM)
m2 <- lm(Attendance ~ Time, data = data1)
# Check the model summary
summary(m2)
Call:
lm(formula = Attendance ~ Time, data = data1)
Residuals:
Min 1Q Median 3Q Max
-32.435 -10.298 1.327 10.246 33.000
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 20.125 1.793 11.226 < 2e-16 ***
Time10AM 6.875 2.639 2.605 0.00953 **
Time11AM 7.658 2.669 2.869 0.00435 **
Time12PM 11.173 2.654 4.210 3.17e-05 ***
Time1PM 13.342 2.686 4.968 1.02e-06 ***
Time2PM 12.310 2.669 4.611 5.43e-06 ***
Time3PM 11.548 2.583 4.470 1.03e-05 ***
Time4PM 4.629 2.524 1.834 0.06740 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 13.41 on 389 degrees of freedom
Multiple R-squared: 0.0974, Adjusted R-squared: 0.08116
F-statistic: 5.997 on 7 and 389 DF, p-value: 1.196e-06
# Assumption checks for m2:
# Linearity: Can be assumed as working with categorical predictors
# Independence of Errors: Using a between-subjects design, so can assume this
# Normality (either use plot(model, which = 2) or hist(model$residuals))
plot(m2, which = 2)
# Equal Variances: residuals vs fitted values should show roughly constant spread
residualPlot(m2)
#### Overall, assumption checks look fine
# Contrast: Midday classes (11AM-2PM) vs Early/Late classes (9-10AM, 3-4PM)
# Check the factor level order so the weights line up with the levels
levels(data1$Time)
[1] "9AM" "10AM" "11AM" "12PM" "1PM" "2PM" "3PM" "4PM"
# Table of contrast weights to present in the Table 1 analysis strategy.
# NOTE: the signs now match the contrast actually tested below
# (+1/4 for the four Midday slots, -1/4 for the four Early/Late slots) -
# the previous version had the signs flipped relative to the emmeans
# contrast, which is why the estimate (+5.36) reflects Midday minus Early/Late.
TimePeriod <- c("Early/Late", "Early/Late", "Midday", "Midday", "Midday", "Midday", "Early/Late", "Early/Late")
Time <- c("9AM", "10AM", "11AM", "12PM", "1PM", "2PM", "3PM", "4PM")
Weight <- c(-1/4, -1/4, 1/4, 1/4, 1/4, 1/4, -1/4, -1/4)
weights <- tibble(TimePeriod, Time, Weight)
# Estimated marginal mean Attendance for each class Time from m2
time_mean <- emmeans(m2, ~Time)
# Look at the means (with SEs and 95% CIs)
time_mean
Time emmean SE df lower.CL upper.CL
9AM 20.1 1.79 389 16.6 23.6
10AM 27.0 1.94 389 23.2 30.8
11AM 27.8 1.98 389 23.9 31.7
12PM 31.3 1.96 389 27.5 35.1
1PM 33.5 2.00 389 29.5 37.4
2PM 32.4 1.98 389 28.5 36.3
3PM 31.7 1.86 389 28.0 35.3
4PM 24.8 1.78 389 21.3 28.2
Confidence level used: 0.95
# Plot the estimated marginal means
plot(time_mean)
# Specify weights for the contrast: +1/4 for the four midday slots (11AM-2PM),
# -1/4 for the four early/late slots (9-10AM, 3-4PM), so a positive estimate
# means higher attendance in the middle of the day
time_comp <- list('Early or Late vs Middle of the Day' = c(-1/4,-1/4, 1/4, 1/4, 1/4, 1/4, -1/4, -1/4))
# Run the contrast analysis (t-test of this weighted combination of means)
time_comp_test <- contrast(time_mean, method = time_comp)
# Examine the output
time_comp_test
contrast estimate SE df t.ratio p.value
Early or Late vs Middle of the Day 5.36 1.35 389 3.963 0.0001
# Obtain the 95% confidence interval for the contrast estimate
confint(time_comp_test)
contrast estimate SE df lower.CL upper.CL
Early or Late vs Middle of the Day 5.36 1.35 389 2.7 8.01
Confidence level used: 0.95
The contrast is significant and positive (estimate = 5.36, 95% CI [2.70, 8.01]), indicating that average attendance is higher for midday classes (11AM-2PM) than for early or late classes.
# Descriptive statistics for data2 (Marks, Attendance) as a formatted table.
# Summary columns are selected by NAME rather than position - the previous
# positional indices (2:4, 8:9) would silently select the wrong columns if
# psych::describe() ever changed its column order.
data2 |>
  describe() |>
  select(n, mean, sd, min, max) |>
  rename("N" = n, "Mean" = mean, "SD" = sd, "Minimum" = min, "Maximum" = max) |>
  kable(caption = "Final Grades & Attendance Descriptive Statistics", digits = 2) |>
  kable_styling()
| N | Mean | SD | Minimum | Maximum | |
|---|---|---|---|---|---|
| Marks | 200 | 49.79 | 15.84 | 25.01 | 98.2 |
| Attendance | 200 | 35.25 | 14.47 | 10.50 | 60.0 |
# Correlation between Attendance and Marks, rounded to 2 dp
# (r = 0.91: a strong positive association)
data2 |>
select(Attendance, Marks) |>
cor() |>
round(digits = 2)
Attendance Marks
Attendance 1.00 0.91
Marks 0.91 1.00
# Scatterplot of final grade against attendance with a fitted regression line
# (no confidence band)
ggplot(data2, aes(x = Attendance, y = Marks)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Attendance (in days)", y = "Final Grade")
# RQ3: fit a simple linear regression of Marks on Attendance
m3 <- lm(Marks ~ Attendance, data = data2)
# Check the model summary
summary(m3)
Call:
lm(formula = Marks ~ Attendance, data = data2)
Residuals:
Min 1Q Median 3Q Max
-17.1477 -4.5210 -0.1861 4.2501 26.8415
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.83270 1.25534 11.82 <2e-16 ***
Attendance 0.99163 0.03296 30.09 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.727 on 198 degrees of freedom
Multiple R-squared: 0.8205, Adjusted R-squared: 0.8196
F-statistic: 905.3 on 1 and 198 DF, p-value: < 2.2e-16
# Assumption checks for m3:
# Linearity (can also use plot(model, which = 1) in place of below) -
# the linear (blue) and loess (red) fits should roughly coincide
ggplot(data2, aes(x = Attendance, y = Marks)) +
geom_point() +
geom_smooth(method = 'lm', se = F) +
geom_smooth(method = 'loess', se = F, colour = 'red') +
labs(x = "Attendance", y = "Final Grade", title = "Scatterplot with linear (blue) and loess (red) lines")
# Independence of Errors: Using a between-subjects design, so can assume this
# Normality (either use plot(model, which = 2) or hist(model$residuals))
plot(m3, which = 2)
# Equal Variances: residuals vs fitted values should show roughly constant spread
residualPlot(m3)
# Bootstrap the m3 coefficients with 1000 case resamples (car::Boot).
# Set a seed first so the resampling - and hence the bootstrap SEs and
# confidence intervals - is reproducible across runs.
set.seed(1234)
boot_m3 <- Boot(m3, R = 1000)
Loading required namespace: boot
# Check the bootstrap summary (bias and SE for each coefficient)
summary(boot_m3)
Number of bootstrap replications R = 1000
original bootBias bootSE bootMed
(Intercept) 14.83270 -0.0462723 1.042487 14.78770
Attendance 0.99163 0.0019082 0.036449 0.99322
# Bootstrap (BCa) confidence intervals for the coefficients
confint(boot_m3)
Bootstrap bca confidence intervals